/*
 * This file contains the core of a bitslice DES implementation for x86-64/SSE2.
 * It is part of John the Ripper password cracker,
 * Copyright (c) 2000-2001,2005,2006,2008,2011,2012,2015,2019 by Solar Designer
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.  (This is a heavily cut-down "BSD license".)
 *
 * Gate counts per S-box: 49 44 46 33 48 46 46 41
 * Average: 44.125
 *
 * The Boolean expressions corresponding to DES S-boxes have been generated
 * by Roman Rusakov <roman_rus at openwall.com> for use in Openwall's
 * John the Ripper password cracker: https://www.openwall.com/john/
 * Being mathematical formulas, they are not copyrighted and are free for reuse
 * by anyone.
 *
 * The x86-64/SSE2 code for the S-boxes was generated by Solar Designer using a
 * Perl script.  The script performed various optimizations, including the
 * x86-64 specific optimization of preferring registers 0-7 over 8-15 to reduce
 * the number of instruction prefixes (and thus code size).  The instruction
 * scheduling has been tuned for Core 2.
 *
 * The effort has been sponsored by Rapid7: https://www.rapid7.com
 */

#include "arch.h"

#if DES_BS_ASM

/*
 * When UNDERSCORES is defined, rename every symbol exported by the DES code
 * to its underscore-prefixed form, so that the labels and .globl directives
 * below are emitted under the prefixed names.
 */
#ifdef UNDERSCORES
#define DES_bs_all			_DES_bs_all
#define DES_bs_init_asm			_DES_bs_init_asm
#define DES_bs_crypt			_DES_bs_crypt
#define DES_bs_crypt_25			_DES_bs_crypt_25
#define DES_bs_crypt_LM			_DES_bs_crypt_LM
#endif

/*
 * DO_ALIGN(log) requests alignment to a 2^log byte boundary.  With ALIGN_LOG
 * defined the exponent is handed to .align unchanged (presumably for
 * assemblers whose .align takes a power-of-two exponent); otherwise it is
 * converted to a byte count with a shift.
 */
#ifdef ALIGN_LOG
#define DO_ALIGN(log)			.align log
#else
#define DO_ALIGN(log)			.align 1 << log
#endif

/*
 * DO_SPACE(size) reserves size bytes, using whichever directive the target
 * assembler accepts.
 */
#ifdef __sun
/* Sun's assembler doesn't recognize .space */
#define DO_SPACE(size)			.zero size
#else
/* Mac OS X assembler doesn't recognize .zero */
#define DO_SPACE(size)			.space size
#endif

/*
 * nptr(n) expands to n added up 8 times and nvec(n) to n added up 16 times,
 * i.e. the byte offset of element n in an array of 8-byte pointers or of
 * 16-byte vectors respectively.
 *
 * Sun's assembler can't multiply, but at least it can add...
 *
 * The expansions are plain, unparenthesized sums.  Arguments used in this
 * file such as "i + 1", "0x300+48" and "0x300-48" still yield the intended
 * multiple because the whole expansion stays a flat sum of terms; an
 * argument containing any other operator would not.
 */
#define nptr(n)				n+n+n+n+n+n+n+n
#define nvec(n)				n+n+n+n+n+n+n+n+n+n+n+n+n+n+n+n

/*
 * Storage for the exported DES_bs_all object, 64-byte aligned.  It is placed
 * in .data when BSD is defined and in .bss otherwise.
 *
 * Fields, in order (sizes follow from the nptr/nvec arguments):
 *   KSp           0x300 pointers
 *   KS_p / KS_v   0x300 vectors; both labels name the same storage
 *   E             96 pointers
 *   K             56 vectors
 *   B             64 vectors
 *   tmp           16 vectors (slot 0 is pnot, slots 1-2 are scratch for S6,
 *                 slots 8-15 are mask01..mask80)
 *   xkeys         64 vectors
 *   pxkeys        128 pointers
 *   keys_changed  4 bytes
 *   salt          4 bytes
 *   Ens           48 pointers
 *
 * NOTE(review): this layout presumably has to match a structure declared on
 * the C side; confirm against the C headers before changing any size or the
 * field order.
 */
#ifdef BSD
.data
#else
.bss
#endif

.globl DES_bs_all
DO_ALIGN(6)
DES_bs_all:
DES_bs_all_KSp:
DO_SPACE(nptr(0x300))
DES_bs_all_KS_p:
DES_bs_all_KS_v:
DO_SPACE(nvec(0x300))
DES_bs_all_E:
DO_SPACE(nptr(96))
DES_bs_all_K:
DO_SPACE(nvec(56))
DES_bs_all_B:
DO_SPACE(nvec(64))
DES_bs_all_tmp:
DO_SPACE(nvec(16))
DES_bs_all_xkeys:
DO_SPACE(nvec(64))
DES_bs_all_pxkeys:
DO_SPACE(nptr(128))
DES_bs_all_keys_changed:
DO_SPACE(4)
DES_bs_all_salt:
DO_SPACE(4)
DES_bs_all_Ens:
DO_SPACE(nptr(48))

/*
 * RIP-relative operands for element i of the E pointer table, the B vector
 * array and the tmp vector array inside DES_bs_all.
 */
#define E(i)				DES_bs_all_E+nptr(i)(%rip)
#define B(i)				DES_bs_all_B+nvec(i)(%rip)
#define tmp_at(i)			DES_bs_all_tmp+nvec(i)(%rip)

/* tmp slot 0: the all-ones vector stored by DES_bs_init_asm */
#define pnot				tmp_at(0)

/*
 * Registers carrying the six S-box inputs.  The xor_E, xor_B and xor_B_KS_p
 * macros load them by these names; the S-box macros refer to the same
 * registers by their %xmm numbers.
 */
#define a1				%xmm0
#define a2				%xmm1
#define a3				%xmm2
#define a4				%xmm3
#define a5				%xmm4
#define a6				%xmm5

/*
 * S1: bitslice DES S-box 1 (presumably the 49-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S1(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm5,%xmm10; \
	pandn %xmm0,%xmm4; \
	movdqa %xmm2,%xmm13; \
	movdqa %xmm4,%xmm14; \
	por %xmm2,%xmm10; \
	movdqa %xmm5,%xmm11; \
	pxor %xmm0,%xmm13; \
	pxor %xmm7,%xmm11; \
	pxor %xmm3,%xmm14; \
	movdqa %xmm13,%xmm12; \
	movdqa %xmm11,%xmm15; \
	pand %xmm10,%xmm13; \
	movdqa %xmm14,%xmm9; \
	movdqa %xmm13,%xmm8; \
	pxor %xmm2,%xmm15; \
	pxor %xmm3,%xmm8; \
	pandn %xmm11,%xmm12; \
	pandn %xmm8,%xmm9; \
	por %xmm5,%xmm13; \
	por %xmm0,%xmm5; \
	pandn %xmm7,%xmm8; \
	pandn %xmm14,%xmm15; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm15,%xmm13; \
	movdqa %xmm9,%xmm15; \
	por %xmm13,%xmm6; \
	pandn %xmm3,%xmm5; \
	movdqa %xmm8,%xmm3; \
	pandn %xmm13,%xmm15; \
	pxor %xmm6,%xmm8; \
	pxor %xmm3,%xmm5; \
	pand %xmm10,%xmm13; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm6,%xmm2; \
	pxor %xmm10,%xmm6; \
	pxor %xmm14,%xmm2; \
	pandn %xmm2,%xmm4; \
	movdqa %xmm4,%xmm2; \
	pxor pnot,%xmm2; \
	pxor %xmm11,%xmm4; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm1,%xmm2; \
	por %xmm3,%xmm4; \
	pandn %xmm8,%xmm2; \
	por %xmm7,%xmm14; \
	pxor %xmm10,%xmm4; \
	por %xmm1,%xmm9; \
	pxor %xmm13,%xmm2; \
	pxor %xmm0,%xmm4; \
	movdqa %xmm1,%xmm0; \
	pxor %xmm4,%xmm13; \
	pxor %xmm13,%xmm9; \
	por %xmm12,%xmm5; \
	pxor out1,%xmm9; \
	por %xmm5,%xmm6; \
	por %xmm11,%xmm13; \
	pxor %xmm4,%xmm6; \
	movdqa %xmm9,out1; \
	por %xmm15,%xmm0; \
	pxor %xmm6,%xmm13; \
	pxor out3,%xmm2; \
	pxor out2,%xmm13; \
	pand %xmm15,%xmm4; \
	pandn %xmm14,%xmm6; \
	pxor %xmm0,%xmm13; \
	pxor %xmm6,%xmm4; \
	movdqa %xmm2,out3; \
	por %xmm1,%xmm4; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm13,out2; \
	pxor out4,%xmm4; \
	movdqa %xmm4,out4

/*
 * S2: bitslice DES S-box 2 (presumably the 44-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S2(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm13; \
	movdqa %xmm5,%xmm6; \
	pxor %xmm1,%xmm13; \
	movdqa %xmm5,%xmm8; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm13,%xmm7; \
	pandn %xmm4,%xmm6; \
	movdqa %xmm5,%xmm9; \
	movdqa %xmm6,%xmm14; \
	pandn %xmm13,%xmm8; \
	pand %xmm0,%xmm7; \
	pxor pnot,%xmm0; \
	por %xmm1,%xmm14; \
	movdqa %xmm8,%xmm12; \
	pxor %xmm4,%xmm7; \
	pand %xmm2,%xmm9; \
	pxor %xmm5,%xmm13; \
	pxor %xmm8,%xmm6; \
	movdqa %xmm9,%xmm10; \
	pand %xmm14,%xmm6; \
	pandn %xmm6,%xmm10; \
	pand %xmm2,%xmm6; \
	pandn %xmm7,%xmm12; \
	pxor %xmm6,%xmm0; \
	movdqa %xmm9,%xmm5; \
	pandn %xmm3,%xmm10; \
	pandn %xmm13,%xmm5; \
	movdqa %xmm5,%xmm11; \
	pandn %xmm1,%xmm5; \
	pxor %xmm0,%xmm11; \
	pxor %xmm13,%xmm2; \
	por %xmm3,%xmm12; \
	pxor %xmm5,%xmm7; \
	movdqa %xmm7,%xmm1; \
	pxor out2,%xmm10; \
	pandn %xmm0,%xmm1; \
	movdqa %xmm3,%xmm0; \
	pxor %xmm2,%xmm1; \
	pandn %xmm14,%xmm0; \
	pxor %xmm11,%xmm14; \
	pxor %xmm5,%xmm6; \
	pxor %xmm1,%xmm0; \
	por %xmm6,%xmm2; \
	por %xmm14,%xmm9; \
	pxor %xmm1,%xmm6; \
	pxor %xmm11,%xmm10; \
	pand %xmm9,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm4,%xmm6; \
	pandn %xmm6,%xmm8; \
	pxor %xmm9,%xmm2; \
	pxor %xmm11,%xmm8; \
	por %xmm8,%xmm3; \
	por %xmm13,%xmm14; \
	pxor %xmm3,%xmm2; \
	pandn %xmm8,%xmm7; \
	movdqa %xmm2,out3; \
	pxor out4,%xmm7; \
	movdqa %xmm10,out2; \
	pxor %xmm14,%xmm7; \
	pxor out1,%xmm0; \
	pxor %xmm12,%xmm7; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out4

/*
 * S3: bitslice DES S-box 3 (presumably the 46-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S3(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm6; \
	movdqa %xmm5,%xmm13; \
	pandn %xmm0,%xmm6; \
	movdqa %xmm5,%xmm8; \
	pxor %xmm2,%xmm13; \
	movdqa %xmm0,%xmm11; \
	por %xmm13,%xmm6; \
	movdqa %xmm13,%xmm9; \
	pxor %xmm3,%xmm8; \
	movdqa %xmm3,%xmm15; \
	pandn %xmm8,%xmm11; \
	pxor %xmm1,%xmm9; \
	movdqa %xmm11,%xmm10; \
	movdqa %xmm4,%xmm12; \
	movdqa %xmm5,%xmm14; \
	pxor %xmm6,%xmm10; \
	pandn %xmm9,%xmm14; \
	movdqa %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pand %xmm5,%xmm7; \
	pand %xmm3,%xmm5; \
	pandn %xmm10,%xmm14; \
	por %xmm3,%xmm7; \
	pandn %xmm10,%xmm12; \
	pand %xmm0,%xmm7; \
	pxor out4,%xmm12; \
	pxor %xmm9,%xmm7; \
	pand %xmm13,%xmm8; \
	pxor %xmm0,%xmm15; \
	pxor %xmm7,%xmm12; \
	pxor %xmm15,%xmm6; \
	pand %xmm3,%xmm13; \
	por %xmm2,%xmm6; \
	por %xmm11,%xmm15; \
	pandn %xmm6,%xmm8; \
	movdqa %xmm15,%xmm6; \
	pand %xmm4,%xmm8; \
	pandn %xmm7,%xmm6; \
	movdqa %xmm1,%xmm7; \
	pandn %xmm10,%xmm1; \
	pandn %xmm5,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm7,%xmm6; \
	movdqa %xmm2,%xmm7; \
	pxor %xmm9,%xmm15; \
	pandn %xmm6,%xmm7; \
	pandn %xmm1,%xmm2; \
	pandn %xmm5,%xmm7; \
	pxor pnot,%xmm15; \
	pxor out2,%xmm7; \
	pxor %xmm2,%xmm15; \
	pandn %xmm4,%xmm14; \
	movdqa %xmm12,out4; \
	por %xmm15,%xmm9; \
	pxor %xmm0,%xmm7; \
	pandn %xmm9,%xmm13; \
	por %xmm11,%xmm1; \
	pxor %xmm8,%xmm7; \
	pxor %xmm1,%xmm13; \
	por %xmm4,%xmm6; \
	pxor out1,%xmm14; \
	pxor %xmm13,%xmm6; \
	pxor %xmm15,%xmm14; \
	pxor out3,%xmm6; \
	movdqa %xmm7,out2; \
	movdqa %xmm14,out1; \
	movdqa %xmm6,out3

/*
 * S4: bitslice DES S-box 4 (presumably the 33-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S4(out1, out2, out3, out4) \
	movdqa %xmm3,%xmm7; \
	movdqa %xmm1,%xmm8; \
	pxor %xmm2,%xmm0; \
	pxor %xmm4,%xmm2; \
	por %xmm1,%xmm3; \
	pandn %xmm2,%xmm1; \
	pxor %xmm4,%xmm3; \
	movdqa %xmm1,%xmm10; \
	pxor %xmm7,%xmm1; \
	pandn %xmm2,%xmm3; \
	movdqa %xmm1,%xmm11; \
	movdqa %xmm3,%xmm6; \
	pxor %xmm8,%xmm7; \
	por %xmm0,%xmm1; \
	pandn %xmm1,%xmm3; \
	movdqa %xmm3,%xmm1; \
	movdqa %xmm5,%xmm12; \
	pxor %xmm8,%xmm3; \
	pand %xmm3,%xmm11; \
	movdqa %xmm11,%xmm9; \
	por %xmm4,%xmm10; \
	pxor %xmm3,%xmm0; \
	pandn %xmm2,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm0,%xmm10; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm11,%xmm6; \
	movdqa %xmm6,%xmm4; \
	pandn %xmm5,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor out1,%xmm6; \
	pandn %xmm4,%xmm5; \
	pxor %xmm1,%xmm7; \
	pxor %xmm7,%xmm6; \
	pxor pnot,%xmm7; \
	pxor %xmm7,%xmm5; \
	pxor %xmm4,%xmm7; \
	pxor out2,%xmm5; \
	movdqa %xmm5,out2; \
	pandn %xmm7,%xmm0; \
	movdqa %xmm12,%xmm7; \
	por %xmm9,%xmm0; \
	movdqa %xmm6,out1; \
	pxor %xmm10,%xmm0; \
	por %xmm3,%xmm12; \
	pxor %xmm0,%xmm12; \
	pxor out4,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor out3,%xmm12; \
	movdqa %xmm12,out3; \
	pxor %xmm3,%xmm0; \
	movdqa %xmm0,out4

/*
 * S5: bitslice DES S-box 5 (presumably the 48-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S5(out1, out2, out3, out4) \
	movdqa %xmm2,%xmm6; \
	por %xmm0,%xmm2; \
	movdqa %xmm5,%xmm7; \
	pandn %xmm2,%xmm5; \
	movdqa %xmm3,%xmm14; \
	pandn %xmm5,%xmm3; \
	pxor %xmm0,%xmm5; \
	pxor %xmm6,%xmm3; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm6,%xmm5; \
	movdqa %xmm3,%xmm10; \
	pand %xmm4,%xmm3; \
	movdqa %xmm5,%xmm8; \
	por %xmm0,%xmm5; \
	pxor %xmm14,%xmm3; \
	pxor %xmm5,%xmm3; \
	movdqa %xmm5,%xmm12; \
	por %xmm14,%xmm8; \
	pxor %xmm0,%xmm2; \
	pxor %xmm3,%xmm7; \
	pand %xmm14,%xmm12; \
	movdqa %xmm7,%xmm9; \
	por %xmm15,%xmm7; \
	pxor %xmm15,%xmm12; \
	pandn %xmm7,%xmm0; \
	pand %xmm4,%xmm7; \
	pxor %xmm8,%xmm4; \
	pxor %xmm7,%xmm12; \
	movdqa %xmm0,%xmm6; \
	pxor %xmm4,%xmm0; \
	pxor %xmm10,%xmm6; \
	movdqa %xmm1,%xmm13; \
	pandn %xmm4,%xmm6; \
	pand %xmm10,%xmm5; \
	pxor pnot,%xmm6; \
	por %xmm12,%xmm0; \
	pandn %xmm6,%xmm13; \
	movdqa %xmm7,%xmm6; \
	pandn %xmm10,%xmm7; \
	pxor %xmm8,%xmm10; \
	pandn %xmm0,%xmm7; \
	pxor %xmm13,%xmm3; \
	pand %xmm7,%xmm9; \
	movdqa %xmm7,%xmm0; \
	pxor %xmm4,%xmm9; \
	pandn %xmm8,%xmm0; \
	pand %xmm1,%xmm8; \
	por %xmm1,%xmm0; \
	pand %xmm9,%xmm14; \
	pxor %xmm2,%xmm7; \
	por %xmm9,%xmm5; \
	pxor %xmm14,%xmm7; \
	pxor %xmm6,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor %xmm15,%xmm9; \
	pandn %xmm10,%xmm7; \
	pand %xmm1,%xmm5; \
	pxor %xmm9,%xmm7; \
	pxor %xmm12,%xmm5; \
	pxor %xmm8,%xmm7; \
	pxor out3,%xmm3; \
	pxor out4,%xmm5; \
	pxor out1,%xmm0; \
	pxor out2,%xmm7; \
	movdqa %xmm3,out3; \
	movdqa %xmm5,out4; \
	movdqa %xmm0,out1; \
	movdqa %xmm7,out2

/*
 * S6: bitslice DES S-box 6 (presumably one of the 46-gate entries of the
 * gate-count table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Mem:  unlike the other S-boxes this one spills to memory: the incoming a1
 *       is saved to tmp_at(1) and the incoming a5 to tmp_at(2), and both are
 *       read back later as memory operands.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S6(out1, out2, out3, out4) \
	movdqa %xmm5,%xmm8; \
	por %xmm1,%xmm5; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm4,tmp_at(2); \
	movdqa %xmm2,%xmm11; \
	pxor %xmm1,%xmm4; \
	pand %xmm0,%xmm5; \
	movdqa %xmm3,%xmm15; \
	pxor %xmm5,%xmm4; \
	movdqa %xmm4,%xmm9; \
	pxor %xmm0,%xmm11; \
	pxor %xmm8,%xmm4; \
	movdqa %xmm0,tmp_at(1); \
	movdqa %xmm4,%xmm12; \
	pand %xmm0,%xmm4; \
	movdqa %xmm11,%xmm0; \
	pandn %xmm7,%xmm12; \
	movdqa %xmm4,%xmm10; \
	pxor %xmm1,%xmm4; \
	por %xmm1,%xmm11; \
	por %xmm4,%xmm0; \
	movdqa %xmm0,%xmm6; \
	por %xmm12,%xmm4; \
	pxor %xmm9,%xmm0; \
	pxor %xmm1,%xmm6; \
	movdqa %xmm4,%xmm14; \
	movdqa %xmm6,%xmm7; \
	pandn %xmm8,%xmm6; \
	pxor %xmm8,%xmm10; \
	pxor %xmm2,%xmm6; \
	pand %xmm0,%xmm2; \
	movdqa %xmm3,%xmm1; \
	pandn %xmm2,%xmm8; \
	movdqa %xmm2,%xmm13; \
	pxor %xmm8,%xmm14; \
	pxor %xmm11,%xmm7; \
	pand %xmm14,%xmm15; \
	pandn tmp_at(2),%xmm2; \
	pxor %xmm0,%xmm15; \
	por tmp_at(1),%xmm0; \
	pxor out4,%xmm15; \
	pand %xmm4,%xmm0; \
	por %xmm6,%xmm2; \
	pxor %xmm6,%xmm0; \
	pxor pnot,%xmm7; \
	pandn %xmm0,%xmm8; \
	pxor %xmm9,%xmm0; \
	por %xmm3,%xmm12; \
	pandn tmp_at(2),%xmm0; \
	por %xmm2,%xmm5; \
	pxor %xmm7,%xmm0; \
	pxor tmp_at(1),%xmm6; \
	pandn %xmm0,%xmm1; \
	pandn %xmm2,%xmm3; \
	pand %xmm10,%xmm6; \
	pxor %xmm3,%xmm7; \
	pxor out3,%xmm8; \
	pxor %xmm6,%xmm7; \
	pxor %xmm12,%xmm8; \
	pxor %xmm1,%xmm5; \
	pxor %xmm13,%xmm7; \
	pxor %xmm11,%xmm14; \
	pxor out2,%xmm5; \
	pxor %xmm14,%xmm5; \
	movdqa %xmm15,out4; \
	pxor out1,%xmm7; \
	movdqa %xmm8,out3; \
	movdqa %xmm5,out2; \
	movdqa %xmm7,out1

/*
 * S7: bitslice DES S-box 7 (presumably one of the 46-gate entries of the
 * gate-count table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S7(out1, out2, out3, out4) \
	movdqa %xmm4,%xmm14; \
	pxor %xmm3,%xmm4; \
	movdqa %xmm3,%xmm11; \
	movdqa %xmm4,%xmm12; \
	pand %xmm4,%xmm11; \
	pxor %xmm2,%xmm4; \
	movdqa %xmm11,%xmm6; \
	movdqa %xmm4,%xmm7; \
	movdqa %xmm11,%xmm15; \
	pand %xmm5,%xmm6; \
	pxor %xmm1,%xmm11; \
	movdqa %xmm7,%xmm13; \
	pand %xmm5,%xmm4; \
	movdqa %xmm11,%xmm10; \
	pxor %xmm5,%xmm12; \
	pxor %xmm2,%xmm6; \
	movdqa %xmm6,%xmm8; \
	por %xmm10,%xmm6; \
	pand %xmm4,%xmm11; \
	pandn %xmm0,%xmm11; \
	pxor %xmm12,%xmm6; \
	pxor %xmm4,%xmm8; \
	pandn %xmm14,%xmm7; \
	movdqa %xmm7,%xmm9; \
	pxor %xmm6,%xmm11; \
	pxor %xmm12,%xmm4; \
	por %xmm10,%xmm7; \
	pxor %xmm8,%xmm7; \
	pandn %xmm3,%xmm4; \
	pxor %xmm14,%xmm8; \
	pandn %xmm10,%xmm4; \
	pxor %xmm4,%xmm8; \
	pandn %xmm13,%xmm12; \
	pand %xmm8,%xmm2; \
	por %xmm15,%xmm6; \
	por %xmm2,%xmm6; \
	pxor %xmm12,%xmm6; \
	movdqa %xmm0,%xmm3; \
	pandn %xmm6,%xmm0; \
	movdqa %xmm6,%xmm4; \
	por %xmm8,%xmm6; \
	pand %xmm5,%xmm6; \
	pxor %xmm7,%xmm0; \
	por %xmm14,%xmm2; \
	pand %xmm6,%xmm1; \
	pxor %xmm4,%xmm7; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm1; \
	pxor %xmm14,%xmm7; \
	movdqa %xmm3,%xmm5; \
	por %xmm2,%xmm7; \
	pxor out1,%xmm0; \
	pand %xmm7,%xmm3; \
	pxor pnot,%xmm4; \
	pxor %xmm6,%xmm7; \
	por %xmm9,%xmm7; \
	pxor out4,%xmm11; \
	pxor %xmm3,%xmm8; \
	pxor %xmm4,%xmm7; \
	pandn %xmm7,%xmm5; \
	movdqa %xmm11,out4; \
	pxor out2,%xmm1; \
	movdqa %xmm0,out1; \
	pxor %xmm5,%xmm1; \
	pxor out3,%xmm8; \
	movdqa %xmm8,out3; \
	movdqa %xmm1,out2

/*
 * S8: bitslice DES S-box 8 (presumably the 41-gate entry of the gate-count
 * table in the file header, if that table is in S1..S8 order).
 *
 * In:   a1-a6 in %xmm0-%xmm5.
 * Out:  out1..out4 are read-modify-write operands: each of the four result
 *       vectors is XORed with the current contents of its operand and stored
 *       back.  Every use in this file passes B(n) memory operands.
 * Uses: pnot (the all-ones vector) to complement a value.
 * Regs: overwrites input registers and takes its temporaries from
 *       %xmm6-%xmm15; callers should not expect any of %xmm0-%xmm15 to
 *       survive.
 *
 * The last instruction stores %xmm5 to out1, so on exit %xmm5 (a6) still
 * holds the new value of out1.  xor_B_KS_p_special, as used in
 * DES_bs_crypt_LM, depends on that.
 *
 * The instruction order is the generator's schedule (see the file header)
 * and is kept exactly as generated.
 */
#define S8(out1, out2, out3, out4) \
	movdqa %xmm1,%xmm13; \
	pandn %xmm2,%xmm1; \
	movdqa %xmm2,%xmm11; \
	movdqa %xmm2,%xmm8; \
	pandn %xmm4,%xmm2; \
	movdqa %xmm1,%xmm6; \
	pxor %xmm3,%xmm2; \
	pandn %xmm13,%xmm11; \
	movdqa %xmm2,%xmm9; \
	pand %xmm0,%xmm2; \
	movdqa %xmm9,%xmm7; \
	pandn %xmm2,%xmm1; \
	pandn %xmm13,%xmm9; \
	pxor %xmm4,%xmm11; \
	movdqa %xmm9,%xmm12; \
	por %xmm0,%xmm9; \
	movdqa %xmm11,%xmm10; \
	pand %xmm9,%xmm11; \
	pxor pnot,%xmm7; \
	por %xmm11,%xmm2; \
	pxor %xmm11,%xmm7; \
	pandn %xmm8,%xmm9; \
	movdqa %xmm5,%xmm15; \
	pxor %xmm9,%xmm7; \
	por %xmm1,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm15; \
	pxor %xmm0,%xmm6; \
	movdqa %xmm6,%xmm14; \
	pxor %xmm13,%xmm7; \
	pand %xmm4,%xmm6; \
	pxor out2,%xmm15; \
	pxor %xmm7,%xmm6; \
	pxor %xmm6,%xmm12; \
	movdqa %xmm15,out2; \
	pxor %xmm2,%xmm6; \
	pxor %xmm4,%xmm14; \
	por %xmm13,%xmm6; \
	pand %xmm5,%xmm2; \
	por %xmm3,%xmm7; \
	pxor %xmm12,%xmm10; \
	pxor %xmm10,%xmm7; \
	pxor %xmm14,%xmm6; \
	pxor %xmm6,%xmm2; \
	pxor %xmm7,%xmm0; \
	pandn %xmm10,%xmm3; \
	pand %xmm5,%xmm0; \
	pand %xmm3,%xmm6; \
	pxor out3,%xmm2; \
	pxor %xmm6,%xmm7; \
	pxor %xmm1,%xmm7; \
	pxor out4,%xmm0; \
	movdqa %xmm2,out3; \
	por %xmm7,%xmm5; \
	pxor out1,%xmm5; \
	pxor %xmm12,%xmm0; \
	pxor %xmm12,%xmm5; \
	movdqa %xmm0,out4; \
	movdqa %xmm5,out1

/*
 * Register that the block-clearing macros store from.  The macros do not
 * initialize it; the caller clears it first (the callers in this file use
 * "pxor zero,zero").
 */
#define zero				%xmm5

/* Store "zero" to the 8 consecutive B() vectors starting at index first. */
#define DES_bs_clear_block_8(first) \
	movdqa zero,B(first); \
	movdqa zero,B(first + 1); \
	movdqa zero,B(first + 2); \
	movdqa zero,B(first + 3); \
	movdqa zero,B(first + 4); \
	movdqa zero,B(first + 5); \
	movdqa zero,B(first + 6); \
	movdqa zero,B(first + 7)

/* Store "zero" to the 32 consecutive B() vectors starting at index first. */
#define DES_bs_clear_block_32(first) \
	DES_bs_clear_block_8(first); \
	DES_bs_clear_block_8(first + 8); \
	DES_bs_clear_block_8(first + 16); \
	DES_bs_clear_block_8(first + 24)

/* Store "zero" to all 64 B() vectors, B(0) first and B(63) last. */
#define DES_bs_clear_block \
	DES_bs_clear_block_32(0); \
	DES_bs_clear_block_32(32)

/*
 * k_ptr is the key pointer register.  K(i) addresses 16-byte vector i and
 * k(i) addresses 8-byte pointer i relative to it.
 */
#define k_ptr				%rdx
#define K(i)				nvec(i)(k_ptr)
#define k(i)				nptr(i)(k_ptr)

/* Scratch registers used to hold pointers fetched from tables */
#define tmp1				%rcx
#define tmp2				%rsi

/*
 * xor_E(i): load the six S-box inputs for key offset i.  For j = 0..5 this
 * fetches the pointer stored in E(i + j) and sets a(j+1) to K(i + j) XOR
 * the vector that pointer addresses.  The loads are interleaved in pairs,
 * alternating between tmp1 and tmp2.
 *
 * Reads k_ptr; clobbers tmp1, tmp2 and a1-a6.
 */
#define xor_E(i) \
	movq E(i),tmp1; \
	movdqa K(i),a1; \
	movq E(i + 1),tmp2; \
	movdqa K(i + 1),a2; \
	pxor (tmp1),a1; \
	pxor (tmp2),a2; \
	movq E(i + 2),tmp1; \
	movdqa K(i + 2),a3; \
	movq E(i + 3),tmp2; \
	movdqa K(i + 3),a4; \
	pxor (tmp1),a3; \
	pxor (tmp2),a4; \
	movq E(i + 4),tmp1; \
	movdqa K(i + 4),a5; \
	movq E(i + 5),tmp2; \
	movdqa K(i + 5),a6; \
	pxor (tmp1),a5; \
	pxor (tmp2),a6

/*
 * xor_B(b1, k1, ..., b6, k6): load the six S-box inputs directly from the B
 * array, without going through the E pointer table.  Sets aN to B(bN) XOR
 * K(kN) for N = 1..6.
 *
 * Reads k_ptr; clobbers a1-a6 only.
 */
#define xor_B(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor K(k1),a1; \
	movdqa B(b3),a3; \
	pxor K(k2),a2; \
	movdqa B(b4),a4; \
	pxor K(k3),a3; \
	movdqa B(b5),a5; \
	pxor K(k4),a4; \
	movdqa B(b6),a6; \
	pxor K(k5),a5; \
	pxor K(k6),a6

/*
 * The xor_B_KS_p macros load the six S-box inputs from the B array and XOR
 * them with key vectors reached through the pointer table at k_ptr: input N
 * becomes B(bN) XOR the vector addressed by the pointer stored in k(kN).
 * All of them clobber tmp1 and tmp2.
 *
 * xor_B_KS_p_prefix computes a1-a4 and leaves the pointer for the sixth key
 * in tmp1.
 */
#define xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6) \
	movq k(k1),tmp1; \
	movq k(k2),tmp2; \
	movdqa B(b1),a1; \
	movdqa B(b2),a2; \
	pxor (tmp1),a1; \
	movq k(k3),tmp1; \
	pxor (tmp2),a2; \
	movq k(k4),tmp2; \
	movdqa B(b3),a3; \
	movdqa B(b4),a4; \
	pxor (tmp1),a3; \
	movq k(k6),tmp1; \
	pxor (tmp2),a4

/*
 * xor_B_KS_p_suffix computes a5 and XORs the sixth key (pointer left in tmp1
 * by the prefix) into whatever a6 currently holds.
 */
#define xor_B_KS_p_suffix(b5, k5) \
	movq k(k5),tmp2; \
	movdqa B(b5),a5; \
	pxor (tmp1),a6; \
	pxor (tmp2),a5

/* Full form: a6 is loaded from B(b6) between the prefix and the suffix. */
#define xor_B_KS_p(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, b6, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	movdqa B(b6),a6; \
	xor_B_KS_p_suffix(b5, k5)

/*
 * Special form: a6 is NOT loaded, so the caller must guarantee that a6
 * (%xmm5) already holds the sixth B value when the macro is entered.
 */
#define xor_B_KS_p_special(b1, k1, b2, k2, b3, k3, b4, k4, b5, k5, k6) \
	xor_B_KS_p_prefix(b1, k1, b2, k2, b3, k3, b4, k4, k6); \
	xor_B_KS_p_suffix(b5, k5)

/*
 * Single-bit byte masks kept in tmp slots 8-15.  DES_bs_init_asm fills them
 * so that every byte of maskXX equals 0xXX.
 */
#define mask01				tmp_at(8)
#define mask02				tmp_at(9)
#define mask04				tmp_at(10)
#define mask08				tmp_at(11)
#define mask10				tmp_at(12)
#define mask20				tmp_at(13)
#define mask40				tmp_at(14)
#define mask80				tmp_at(15)

/* v_ptr is the vector source/destination pointer; V(i) is vector i from it */
#define v_ptr				%rax
#define V(i)				nvec(i)(v_ptr)

/*
 * SHLB1(reg): shift every byte of reg left by one bit.  The enabled form
 * adds the register to itself byte-wise.  The disabled psllq form would
 * carry a set top bit into the neighbouring byte, so the two are equivalent
 * only when no byte has its top bit set; that holds at every use visible in
 * this file.
 */
#if 1
#define SHLB1(reg)			paddb reg,reg
#else
#define SHLB1(reg)			psllq $1,reg
#endif

/*
 * FINALIZE_NEXT_KEY_BITS_0_6: build seven key vectors K(0)..K(6) from the
 * eight vectors V(0)..V(7).
 *
 * For each bit position b = 0..6 the macro masks every V(j) with the
 * single-bit mask for b, shifts the surviving bit to bit position j of its
 * byte, ORs the eight results together and stores the result to K(b).  In
 * every byte lane, bit j of K(b) is therefore bit b of V(j).  Bit position 7
 * is not handled here.
 *
 * In:   v_ptr and k_ptr set up by the caller;
 *       %xmm7..%xmm13 = mask01, mask02, mask04, mask08, mask10, mask20,
 *       mask40 (in that order).
 * Out:  K(0)..K(6) written.
 * Regs: clobbers %xmm0-%xmm6; the mask registers and both pointers are left
 *       unchanged, so the caller advances v_ptr and k_ptr itself.
 */
#define FINALIZE_NEXT_KEY_BITS_0_6 \
	movdqa V(0),%xmm0; \
	movdqa V(1),%xmm1; \
	movdqa V(2),%xmm2; \
	movdqa V(3),%xmm3; \
	pand %xmm7,%xmm0; \
	pand %xmm7,%xmm1; \
	pand %xmm7,%xmm2; \
	pand %xmm7,%xmm3; \
	SHLB1(%xmm1); \
	psllq $2,%xmm2; \
	psllq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm7,%xmm4; \
	pand %xmm7,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $4,%xmm4; \
	pand %xmm7,%xmm6; \
	pand %xmm7,%xmm0; \
	psllq $5,%xmm5; \
	psllq $6,%xmm6; \
	psllq $7,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(0); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm8,%xmm1; \
	pand %xmm8,%xmm2; \
	pand %xmm8,%xmm0; \
	pand %xmm8,%xmm3; \
	psrlq $1,%xmm0; \
	SHLB1(%xmm2); \
	psllq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm8,%xmm4; \
	pand %xmm8,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $3,%xmm4; \
	pand %xmm8,%xmm6; \
	pand %xmm8,%xmm0; \
	psllq $4,%xmm5; \
	psllq $5,%xmm6; \
	psllq $6,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(1); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm9,%xmm1; \
	pand %xmm9,%xmm2; \
	pand %xmm9,%xmm0; \
	pand %xmm9,%xmm3; \
	psrlq $1,%xmm1; \
	psrlq $2,%xmm0; \
	SHLB1(%xmm3); \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm9,%xmm4; \
	pand %xmm9,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psllq $2,%xmm4; \
	pand %xmm9,%xmm6; \
	pand %xmm9,%xmm0; \
	psllq $3,%xmm5; \
	psllq $4,%xmm6; \
	psllq $5,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(2); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm10,%xmm1; \
	pand %xmm10,%xmm2; \
	pand %xmm10,%xmm0; \
	pand %xmm10,%xmm3; \
	psrlq $2,%xmm1; \
	psrlq $3,%xmm0; \
	psrlq $1,%xmm2; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm10,%xmm4; \
	pand %xmm10,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	SHLB1(%xmm4); \
	pand %xmm10,%xmm6; \
	pand %xmm10,%xmm0; \
	psllq $2,%xmm5; \
	psllq $3,%xmm6; \
	psllq $4,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(3); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm11,%xmm1; \
	pand %xmm11,%xmm2; \
	pand %xmm11,%xmm0; \
	pand %xmm11,%xmm3; \
	psrlq $3,%xmm1; \
	psrlq $4,%xmm0; \
	psrlq $2,%xmm2; \
	psrlq $1,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm11,%xmm4; \
	pand %xmm11,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	pand %xmm11,%xmm6; \
	pand %xmm11,%xmm0; \
	SHLB1(%xmm5); \
	psllq $2,%xmm6; \
	psllq $3,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(4); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm12,%xmm1; \
	pand %xmm12,%xmm2; \
	pand %xmm12,%xmm0; \
	pand %xmm12,%xmm3; \
	psrlq $4,%xmm1; \
	psrlq $5,%xmm0; \
	psrlq $3,%xmm2; \
	psrlq $2,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm12,%xmm4; \
	pand %xmm12,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $1,%xmm4; \
	pand %xmm12,%xmm6; \
	pand %xmm12,%xmm0; \
	SHLB1(%xmm6); \
	psllq $2,%xmm0; \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	movdqa V(1),%xmm1; \
	por %xmm3,%xmm0; \
	movdqa V(2),%xmm2; \
	movdqa %xmm0,K(5); \
\
	movdqa V(0),%xmm0; \
	movdqa V(3),%xmm3; \
	pand %xmm13,%xmm1; \
	pand %xmm13,%xmm2; \
	pand %xmm13,%xmm0; \
	pand %xmm13,%xmm3; \
	psrlq $5,%xmm1; \
	psrlq $6,%xmm0; \
	psrlq $4,%xmm2; \
	psrlq $3,%xmm3; \
	por %xmm0,%xmm1; \
	por %xmm2,%xmm3; \
	movdqa V(4),%xmm4; \
	movdqa V(5),%xmm5; \
	por %xmm1,%xmm3; \
	pand %xmm13,%xmm4; \
	pand %xmm13,%xmm5; \
	movdqa V(6),%xmm6; \
	movdqa V(7),%xmm0; \
	psrlq $2,%xmm4; \
	pand %xmm13,%xmm6; \
	pand %xmm13,%xmm0; \
	psrlq $1,%xmm5; \
	SHLB1(%xmm0); \
	por %xmm4,%xmm5; \
	por %xmm6,%xmm3; \
	por %xmm5,%xmm0; \
	por %xmm3,%xmm0; \
	movdqa %xmm0,K(6)

.text

/*
 * DES_bs_init_asm: fill in the constant vectors kept in DES_bs_all.tmp.
 *
 * Stores the all-ones vector to pnot and, for each of mask01..mask80, a
 * vector whose every byte holds that single bit (0x01, 0x02, ... 0x80).
 * Reads no arguments; clobbers %xmm0 and writes only the tmp slots named
 * above.
 */
DO_ALIGN(6)
.globl DES_bs_init_asm
DES_bs_init_asm:
	/* all bits set */
	pcmpeqd %xmm0,%xmm0
	movdqa %xmm0,pnot
	/* 0xFF + 0xFF = 0xFE per byte; inverting that leaves 0x01 per byte */
	paddb %xmm0,%xmm0
	pxor pnot,%xmm0
	movdqa %xmm0,mask01
	/* each further mask is the previous one shifted left by one bit */
	SHLB1(%xmm0)
	movdqa %xmm0,mask02
	SHLB1(%xmm0)
	movdqa %xmm0,mask04
	SHLB1(%xmm0)
	movdqa %xmm0,mask08
	SHLB1(%xmm0)
	movdqa %xmm0,mask10
	SHLB1(%xmm0)
	movdqa %xmm0,mask20
	SHLB1(%xmm0)
	movdqa %xmm0,mask40
	SHLB1(%xmm0)
	movdqa %xmm0,mask80
	ret

/*
 * iterations: loop counter, kept in %edi.  In DES_bs_crypt it is used as
 * received on entry, i.e. the caller passes the count in %edi (the first
 * integer argument register of the System V AMD64 calling convention).
 * rounds_and_swapped: pass counter for the 16-round loop, in %eax.  Values
 * 8..1 count passes that enter at the "start" half; values 0x108..0x100
 * count passes of an iteration that was entered at the "swap" half.
 */
#define iterations			%edi
#define rounds_and_swapped		%eax

/*
 * DES_bs_crypt: run "iterations" 16-round DES passes over the bitslice
 * block B, using the key schedule vectors at DES_bs_all_KS_v and the E
 * pointer table.
 *
 * In:   %edi = iteration count (must be non-zero: the count is only tested
 *       after it has been decremented).
 * Out:  B(0)..B(63) updated in place.  No return value is set up.
 * Regs: clobbers %rax, %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm15 and flags.
 *
 * If DES_bs_all_keys_changed is non-zero, DES_bs_finalize_keys is called
 * first; %rdi is saved around that call because the callee uses it as its
 * own loop counter.
 *
 * Each pass through start+swap performs two rounds and advances k_ptr by 96
 * vectors; 8 passes cover the 0x300-vector schedule.  Consecutive
 * iterations alternate between entering at _start and at _swap, with k_ptr
 * rewound by 0x300+48 or 0x300-48 vectors so that the K(48..95) operands of
 * the swap half line up with the first 48 schedule vectors (presumably this
 * stands in for physically swapping the two block halves between
 * iterations).
 */
DO_ALIGN(6)
.globl DES_bs_crypt
DES_bs_crypt:
	cmpl $0,DES_bs_all_keys_changed(%rip)
	jz DES_bs_crypt_body
	pushq %rdi
	call DES_bs_finalize_keys
	popq %rdi
DES_bs_crypt_body:
	/* start from an all-zero block */
	pxor zero,zero
	leaq DES_bs_all_KS_v(%rip),k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
DES_bs_crypt_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_E(12)
	S3(B(55), B(47), B(61), B(37))
	xor_E(18)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_E(36)
	S7(B(63), B(43), B(53), B(38))
	xor_E(42)
	S8(B(36), B(58), B(46), B(52))
	/* 0x100 marks the end of an iteration that was entered at _swap */
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_next
DES_bs_crypt_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_E(60)
	S3(B(23), B(15), B(29), B(5))
	xor_E(66)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_E(84)
	S7(B(31), B(11), B(21), B(6))
	xor_E(90)
	/* the key inputs are already loaded, so k_ptr can advance before S8 */
	addq $nvec(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds_and_swapped
	jnz DES_bs_crypt_start
	/* 16 rounds done; next iteration (if any) is entered at _swap */
	subq $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_swap
	ret
DES_bs_crypt_next:
	/* end of a swapped iteration; the next one is entered at _start */
	subq $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_start
	ret

/*
 * DES_bs_crypt_25: same round structure as DES_bs_crypt, but with the
 * iteration count fixed at 25 and with the inputs of S3, S4, S7 and S8
 * taken straight from B via xor_B instead of through the E pointer table.
 *
 * In:   nothing is read from the argument registers.
 * Out:  B(0)..B(63) updated in place.  No return value is set up.
 * Regs: clobbers %rax, %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm15 and flags.
 *
 * If DES_bs_all_keys_changed is non-zero, control jumps to
 * DES_bs_finalize_keys_25, which pushes the address of
 * DES_bs_crypt_25_body and falls into DES_bs_finalize_keys, so that the
 * latter's ret lands back at the _body label below.
 */
DO_ALIGN(6)
.globl DES_bs_crypt_25
DES_bs_crypt_25:
	cmpl $0,DES_bs_all_keys_changed(%rip)
	jnz DES_bs_finalize_keys_25
DES_bs_crypt_25_body:
	/* start from an all-zero block */
	pxor zero,zero
	leaq DES_bs_all_KS_v(%rip),k_ptr
	DES_bs_clear_block
	movl $8,rounds_and_swapped
	movl $25,iterations
DES_bs_crypt_25_start:
	xor_E(0)
	S1(B(40), B(48), B(54), B(62))
	xor_E(6)
	S2(B(44), B(59), B(33), B(49))
	xor_B(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_E(24)
	S5(B(39), B(45), B(56), B(34))
	xor_E(30)
	S6(B(35), B(60), B(42), B(50))
	xor_B(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	/* 0x100 marks the end of an iteration that was entered at _swap */
	cmpl $0x100,rounds_and_swapped
	je DES_bs_crypt_25_next
DES_bs_crypt_25_swap:
	xor_E(48)
	S1(B(8), B(16), B(22), B(30))
	xor_E(54)
	S2(B(12), B(27), B(1), B(17))
	xor_B(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_E(72)
	S5(B(7), B(13), B(24), B(2))
	xor_E(78)
	S6(B(3), B(28), B(10), B(18))
	xor_B(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	S8(B(4), B(26), B(14), B(20))
	addq $nvec(96),k_ptr
	subl $1,rounds_and_swapped
	jnz DES_bs_crypt_25_start
	/* 16 rounds done; next iteration (if any) is entered at _swap */
	subq $nvec(0x300+48),k_ptr
	movl $0x108,rounds_and_swapped
	subl $1,iterations
	jnz DES_bs_crypt_25_swap
	ret
DES_bs_crypt_25_next:
	subq $nvec(0x300-48),k_ptr
	movl $8,rounds_and_swapped
	subl $1,iterations
	/*
	 * No zero test is needed on this path: the count starts at 25 and the
	 * two decrement sites alternate beginning with the one above, so only
	 * that one can see the count reach zero.
	 */
	jmp DES_bs_crypt_25_start

/*
 * DES_bs_finalize_keys_25: entry used by DES_bs_crypt_25.  It pushes the
 * address of DES_bs_crypt_25_body as a return address and falls through, so
 * that the ret at the end of DES_bs_finalize_keys continues there.
 *
 * DES_bs_finalize_keys: rebuild the key schedule used by DES_bs_crypt and
 * DES_bs_crypt_25.
 *   1. Clear DES_bs_all_keys_changed.
 *   2. Main loop, 8 passes: turn 8 xkeys vectors into 7 K vectors per pass
 *      with FINALIZE_NEXT_KEY_BITS_0_6 (64 input vectors, 56 K vectors).
 *   3. Expand loop, 0x60 passes of 8: for each of the 0x300 pointers in KSp,
 *      copy the 16-byte vector it addresses into the matching KS_v slot.
 *
 * Regs: clobbers %rax (v_ptr), %rdx (k_ptr), %rcx, %rsi, %edi (iterations),
 *       %xmm0-%xmm13 and flags.  Uses no stack beyond its return address.
 */
DES_bs_finalize_keys_25:
	leaq DES_bs_crypt_25_body(%rip),tmp1
	pushq tmp1
DES_bs_finalize_keys:
	/* masks for bit positions 0..6, in the registers the macro expects */
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movl $8,iterations
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $0,DES_bs_all_keys_changed(%rip)
	movdqa mask40,%xmm13
DES_bs_finalize_keys_main_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
	/* 7 vectors produced from 8 consumed */
	addq $nvec(7),k_ptr
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_main_loop
	/* from here on k_ptr walks the pointer table and v_ptr the vectors */
	leaq DES_bs_all_KSp(%rip),k_ptr
	leaq DES_bs_all_KS_v(%rip),v_ptr
	movl $0x60,iterations
DES_bs_finalize_keys_expand_loop:
	movq k(0),tmp1
	movq k(1),tmp2
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movq k(2),tmp1
	movq k(3),tmp2
	movdqa %xmm0,V(0)
	movdqa %xmm1,V(1)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movq k(4),tmp1
	movq k(5),tmp2
	movdqa %xmm0,V(2)
	movdqa %xmm1,V(3)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	movq k(6),tmp1
	movq k(7),tmp2
	movdqa %xmm0,V(4)
	movdqa %xmm1,V(5)
	movdqa (tmp1),%xmm0
	movdqa (tmp2),%xmm1
	addq $nptr(8),k_ptr
	movdqa %xmm0,V(6)
	movdqa %xmm1,V(7)
	addq $nvec(8),v_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_expand_loop
	ret

/* Register holding the all-ones vector while B is being initialized */
#define ones				%xmm1

/* Double-round pass counter for the LM loop (same register as v_ptr) */
#define rounds				%eax

/*
 * DES_bs_crypt_LM: finalize the keys, set B to a fixed bit pattern and run
 * 16 DES rounds over it, with key vectors reached through the pointer table
 * at DES_bs_all_KS_p.
 *
 * In:   %rdi = pointer to a 32-bit value, which is loaded on entry.
 * Out:  %eax = that 32-bit value, returned unchanged; B(0)..B(63) hold the
 *       result.
 * Regs: clobbers %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm15 and flags.
 *
 * Key finalization runs 7 passes, each turning 8 xkeys vectors into 8 K
 * vectors: bits 0-6 via FINALIZE_NEXT_KEY_BITS_0_6 and bit 7 by the inline
 * code that follows it (56 vectors in, 56 out).
 *
 * NOTE(review): the zero/ones pattern stored to B is presumably the
 * bitslice form of the fixed LM plaintext; that cannot be confirmed from
 * this file.
 */
DO_ALIGN(6)
.globl DES_bs_crypt_LM
DES_bs_crypt_LM:
	/* fetch the value to return before %edi is reused as a counter */
	movl (%rdi),%r8d
	movdqa mask01,%xmm7
	movdqa mask02,%xmm8
	leaq DES_bs_all_xkeys(%rip),v_ptr
	movdqa mask04,%xmm9
	movdqa mask08,%xmm10
	leaq DES_bs_all_K(%rip),k_ptr
	movdqa mask10,%xmm11
	movdqa mask20,%xmm12
	movl $7,iterations
	movdqa mask40,%xmm13
	movdqa mask80,%xmm14
DES_bs_finalize_keys_LM_loop:
	FINALIZE_NEXT_KEY_BITS_0_6
# bit 7
	movdqa V(0),%xmm0
	movdqa V(1),%xmm1
	movdqa V(2),%xmm2
	movdqa V(3),%xmm3
	pand %xmm14,%xmm0
	pand %xmm14,%xmm1
	pand %xmm14,%xmm2
	pand %xmm14,%xmm3
	psrlq $7,%xmm0
	psrlq $6,%xmm1
	psrlq $5,%xmm2
	psrlq $4,%xmm3
	por %xmm0,%xmm1
	por %xmm2,%xmm3
	movdqa V(4),%xmm4
	movdqa V(5),%xmm5
	por %xmm1,%xmm3
	pand %xmm14,%xmm4
	pand %xmm14,%xmm5
	movdqa V(6),%xmm6
	movdqa V(7),%xmm0
	psrlq $3,%xmm4
	pand %xmm14,%xmm6
	pand %xmm14,%xmm0
	psrlq $2,%xmm5
	psrlq $1,%xmm6
	por %xmm4,%xmm5
	por %xmm6,%xmm3
	por %xmm5,%xmm0
	addq $nvec(8),v_ptr
	por %xmm3,%xmm0
	movdqa %xmm0,K(7)
	addq $nvec(8),k_ptr
	subl $1,iterations
	jnz DES_bs_finalize_keys_LM_loop

	/*
	 * Initialize B.  zero is %xmm5 (a6) and B(4) is set to zero, so on the
	 * first pass of the loop below a6 already equals B(4), which is what
	 * the first xor_B_KS_p_special relies on.
	 */
	pxor zero,zero
	pcmpeqd ones,ones
	leaq DES_bs_all_KS_p(%rip),k_ptr
	movdqa zero,B(0)
	movdqa zero,B(1)
	movdqa zero,B(2)
	movdqa zero,B(3)
	movdqa zero,B(4)
	movdqa zero,B(5)
	movdqa zero,B(6)
	movdqa zero,B(7)
	movdqa ones,B(8)
	movdqa ones,B(9)
	movdqa ones,B(10)
	movdqa zero,B(11)
	movdqa ones,B(12)
	movdqa zero,B(13)
	movdqa zero,B(14)
	movdqa zero,B(15)
	movdqa zero,B(16)
	movdqa zero,B(17)
	movdqa zero,B(18)
	movdqa zero,B(19)
	movdqa zero,B(20)
	movdqa zero,B(21)
	movdqa zero,B(22)
	movdqa ones,B(23)
	movdqa zero,B(24)
	movdqa zero,B(25)
	movdqa ones,B(26)
	movdqa zero,B(27)
	movdqa zero,B(28)
	movdqa ones,B(29)
	movdqa ones,B(30)
	movdqa ones,B(31)
	movdqa zero,B(32)
	movdqa zero,B(33)
	movdqa zero,B(34)
	movdqa ones,B(35)
	movdqa zero,B(36)
	movdqa ones,B(37)
	movdqa ones,B(38)
	movdqa ones,B(39)
	movdqa zero,B(40)
	movdqa zero,B(41)
	movdqa zero,B(42)
	movdqa zero,B(43)
	movdqa zero,B(44)
	movdqa ones,B(45)
	movdqa zero,B(46)
	movdqa zero,B(47)
	movdqa ones,B(48)
	movdqa ones,B(49)
	movdqa zero,B(50)
	movdqa zero,B(51)
	movdqa zero,B(52)
	movdqa zero,B(53)
	movdqa ones,B(54)
	movdqa zero,B(55)
	movdqa ones,B(56)
	movdqa zero,B(57)
	movdqa ones,B(58)
	movdqa zero,B(59)
	movdqa ones,B(60)
	movdqa ones,B(61)
	movdqa ones,B(62)
	movdqa ones,B(63)
	movl $8,rounds
DES_bs_crypt_LM_loop:
	/* a6 is still B(4): zeroed above, or left in %xmm5 by S8(B(4), ...) */
	xor_B_KS_p_special(31, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5)
	S1(B(40), B(48), B(54), B(62))
	xor_B_KS_p(3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11)
	S2(B(44), B(59), B(33), B(49))
	xor_B_KS_p(7, 12, 8, 13, 9, 14, 10, 15, 11, 16, 12, 17)
	S3(B(55), B(47), B(61), B(37))
	xor_B_KS_p(11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 23)
	S4(B(57), B(51), B(41), B(32))
	xor_B_KS_p(15, 24, 16, 25, 17, 26, 18, 27, 19, 28, 20, 29)
	S5(B(39), B(45), B(56), B(34))
	xor_B_KS_p(19, 30, 20, 31, 21, 32, 22, 33, 23, 34, 24, 35)
	S6(B(35), B(60), B(42), B(50))
	xor_B_KS_p(23, 36, 24, 37, 25, 38, 26, 39, 27, 40, 28, 41)
	S7(B(63), B(43), B(53), B(38))
	xor_B_KS_p(27, 42, 28, 43, 29, 44, 30, 45, 31, 46, 0, 47)
	S8(B(36), B(58), B(46), B(52))
	/* a6 is still B(36), left in %xmm5 by the S8 just above */
	xor_B_KS_p_special(63, 48, 32, 49, 33, 50, 34, 51, 35, 52, 53)
	S1(B(8), B(16), B(22), B(30))
	xor_B_KS_p(35, 54, 36, 55, 37, 56, 38, 57, 39, 58, 40, 59)
	S2(B(12), B(27), B(1), B(17))
	xor_B_KS_p(39, 60, 40, 61, 41, 62, 42, 63, 43, 64, 44, 65)
	S3(B(23), B(15), B(29), B(5))
	xor_B_KS_p(43, 66, 44, 67, 45, 68, 46, 69, 47, 70, 48, 71)
	S4(B(25), B(19), B(9), B(0))
	xor_B_KS_p(47, 72, 48, 73, 49, 74, 50, 75, 51, 76, 52, 77)
	S5(B(7), B(13), B(24), B(2))
	xor_B_KS_p(51, 78, 52, 79, 53, 80, 54, 81, 55, 82, 56, 83)
	S6(B(3), B(28), B(10), B(18))
	xor_B_KS_p(55, 84, 56, 85, 57, 86, 58, 87, 59, 88, 60, 89)
	S7(B(31), B(11), B(21), B(6))
	xor_B_KS_p(59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 32, 95)
	/* the key inputs are already loaded, so k_ptr can advance before S8 */
	addq $nptr(96),k_ptr
	S8(B(4), B(26), B(14), B(20))
	subl $1,rounds
	jnz DES_bs_crypt_LM_loop
	/* return the value that was loaded from (%rdi) on entry */
	xchgq %r8,%rax
	ret

#endif

#if CPU_REQ
/*
 * CPU detection.
 *
 * CPU_detect takes no arguments and returns 1 in %eax when both the CPU and
 * the operating system provide every feature this build requires, and 0
 * otherwise.  AVX (together with XSAVE and OSXSAVE) is always required;
 * XOP is additionally required when CPU_REQ_XOP is defined, AVX2 when
 * CPU_REQ_AVX2 or CPU_REQ_AVX512F is defined, and AVX-512F when
 * CPU_REQ_AVX512F is defined.
 *
 * Clobbers %rax, %rcx, %rdx and flags.  %rbx is saved and restored because
 * CPUID overwrites it.
 */

/* Leaf 1, ECX: XSAVE (bit 26), OSXSAVE (bit 27), AVX (bit 28) */
#define CF_XSAVE_OSXSAVE_AVX		$0x1C000000

/*
 * XCR0 state components that the OS must have enabled:
 * SSE (bit 1) and AVX (bit 2), plus opmask (bit 5), ZMM_Hi256 (bit 6) and
 * Hi16_ZMM (bit 7) when AVX-512 is going to be used.
 */
#define XCR0_AVX_STATE			$0x06
#define XCR0_AVX512_STATE		$0xE6

/* Extended features: leaf 0x80000001, ECX bit 11 */
#define CX_XOP				$0x00000800

/* Leaf 7, EBX: AVX2 (bit 5), AVX512F (bit 16) */
#define C7_AVX2				$0x00000020
#define C7_AVX512F			$0x00010000

.text

#ifdef UNDERSCORES
#define CPU_detect _CPU_detect
#endif
.globl CPU_detect
CPU_detect:
	pushq %rbx

/* First, leaf 1 checks */
	movl $1,%eax
	cpuid
	andl CF_XSAVE_OSXSAVE_AVX,%ecx
	cmpl CF_XSAVE_OSXSAVE_AVX,%ecx
	jne CPU_detect_fail

/*
 * Check that state is preserved on a context switch.  OSXSAVE was verified
 * above, so XGETBV is safe to execute here.  For AVX-512 the OS must have
 * enabled the opmask and both ZMM state components as well, otherwise the
 * instructions fault even though CPUID reports them.
 */
	xorl %ecx,%ecx
	xgetbv
#ifdef CPU_REQ_AVX512F
	andb XCR0_AVX512_STATE,%al
	cmpb XCR0_AVX512_STATE,%al
#else
	andb XCR0_AVX_STATE,%al
	cmpb XCR0_AVX_STATE,%al
#endif
	jne CPU_detect_fail

/* Extended feature tests (if required) */
#ifdef CPU_REQ_XOP
	movl $0x80000000,%eax
	cpuid
	movl $0x80000001,%edx
	cmpl %edx,%eax
/*
 * Leaf numbers are unsigned.  A signed comparison would accept any value
 * below 0x80000000 (as returned by a CPU without extended leaves) because
 * 0x80000001 is negative when treated as signed.
 */
	jb CPU_detect_fail
	xchgl %edx,%eax
	cpuid
	testl CX_XOP,%ecx
	jz CPU_detect_fail
#endif

/* Finally, leaf 7 tests (if required) */
#if defined(CPU_REQ_AVX2) || defined(CPU_REQ_AVX512F)
	xorl %eax,%eax
	cpuid
	movl $7,%edx
	cmpl %edx,%eax
/* unsigned comparison, as for the extended leaf check */
	jb CPU_detect_fail
	xchgl %edx,%eax
/* leaf 7 is indexed by sub-leaf; the feature bits are in sub-leaf 0 */
	xorl %ecx,%ecx
	cpuid
	testl C7_AVX2,%ebx
	jz CPU_detect_fail
#endif
#ifdef CPU_REQ_AVX512F
/* %ebx still holds the leaf 7 result from the block above */
	testl C7_AVX512F,%ebx
	jz CPU_detect_fail
#endif

/* If we reached here all is fine and we return 1 */
	movl $1,%eax
	popq %rbx
	ret

/* Return 0 */
CPU_detect_fail:
	xorl %eax,%eax
	popq %rbx
	ret
#endif

/*
 * On ELF/Linux, emit an empty .note.GNU-stack section so that the linker
 * does not treat this object as requiring an executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif
